import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (15, 9)
%matplotlib inline
import seaborn as sbn
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
#Misc.
import psycopg2
from typing import Union, Tuple, List, Dict, Any, Optional
from pmdarima.arima import ADFTest
from sklearn.metrics import mean_squared_error
from fastdtw import fastdtw
import pmdarima as pm
from statsmodels.tsa.arima_model import ARIMA
import statsmodels.api as sm
import dtw
#Scipy - Clustering and Statistics
import scipy
import scipy.cluster.hierarchy as sch
from scipy import stats
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist, squareform, euclidean
#NetworkX - Graphs
import networkx as nx
#RESTful APIs and Scraping
import requests
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import bs4 as bs
# NLTK VADER for sentiment analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import warnings
warnings.filterwarnings("ignore")
array_1d = Union[List, Tuple, pd.Series, np.ndarray]
date_obj = Union[datetime.datetime, datetime.date]
#Send a GET request to the Wikipedia API
resp = requests.get('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies') #Returns the html of the Wikipedia page for S&P500 companies
soup = bs.BeautifulSoup(resp.text, 'lxml') #Parse the webpage from Wikipedia with BeautifulSoup
table = soup.find('table', {'class': 'wikitable sortable'}) #Find the contents under the table tag
tickers = [] #Stores tickers
names = [] #Stores company names
industries = [] #Stores GICS Industry
for row in table.findAll('tr')[1:]: #Iterates through each row in the table excluding the header and extracts relevant data
tickers.append(row.findAll('td')[0].text)
names.append(row.findAll('td')[1].text)
industries.append(row.findAll('td')[3].text)
tickers = [s.replace('\n', '') for s in tickers] #delete newline characters in parsed data
stock_info = pd.DataFrame({'Ticker': tickers, 'Name': names, 'Sector': industries}) #Create Dataframe from Wikipedia data
stock_info = stock_info.replace('Communication Services\n','Communication Services') #Clean messy data
stock_info = stock_info.sort_values(by=['Ticker'], ignore_index = True) #Sort Alphabetically by Ticker
stock_info.head()
print(f' Number of stocks in stock_info: {stock_info.shape[0]}')
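As a sanity check, the same constituents table can also be pulled with pandas directly; the snippet below is a minimal sketch, assuming the first table on the page is the constituents list and that its columns include 'Symbol', 'Security' and 'GICS Sector'.
wiki_tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies') #Parse every table on the page
sp500_table = wiki_tables[0] #Assume the first table is the constituents list
sp500_alt = sp500_table[['Symbol', 'Security', 'GICS Sector']].rename(
    columns={'Symbol': 'Ticker', 'Security': 'Name', 'GICS Sector': 'Sector'})
sp500_alt.head()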
start_date = datetime.datetime(2019, 4, 1)
end_date = datetime.datetime(2019, 12, 1)
#The credentials for connecting to the database are stored as environment variables (hidden in the Anaconda environment)
CONNECTION_PARAMS = {
'database': os.environ.get("DATABASE"),
'user': os.environ.get("USERNAME"),
'password': os.environ.get("PASSWORD"),
'host': os.environ.get("HOST"),
'port': os.environ.get("PORT"),
}
def execute_query(
query: str, params: array_1d = None, commit: bool = False
) -> Tuple:
"""Connects to the PostgreSQL database and executes a specified query.
This query should be specified as a string and can contain placeholders.
If placeholders are used, the second parameter should be a tuple of
values to substitute into the query string.
See the Psycopg2 documentation for more info:
https://www.psycopg.org/docs/usage.html
If the database will be modified by the query (i.e. INSERT) the third
commit parameter should be set to True.
Args:
query: The entire SQL query (can contain placeholders)
params: Values for each placeholder in the query
commit: Whether to commit the changes to the database
Returns:
A tuple pair consisting of a list of column names and the results of
the query. The results are a list of tuples, each corresponding to a
table row. If the request fails, returns None.
"""
col_names = None
results = None
with psycopg2.connect(**CONNECTION_PARAMS) as conn:
cur = conn.cursor()
cur.execute(query, params) # Execute query
# Get column names from response
col_names = [desc.name for desc in cur.description]
results = cur.fetchall() # Get queried data
if commit:
conn.commit()
cur.close()
if results:
return col_names, results
return None
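For example, a parameterised query might look like the sketch below; it assumes the database credentials above are set and that an 'equity' table with 'date' and 'close' columns exists, as used later in this notebook.
# Example usage of execute_query with a placeholder (sketch only)
cols, rows = execute_query(
    "SELECT date, close FROM equity WHERE ticker = %s LIMIT 5;",
    params=("AAPL",),
)
pd.DataFrame(rows, columns=cols)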
def get_data(table_name: str, tickers: Optional[Union[str, array_1d]] = None,
start_date: Optional[date_obj] = None,
end_date: Optional[date_obj] = None,
attributes: Optional[array_1d] = None
) -> pd.DataFrame:
"""Gets data from either the equity or fundamental table.
    Filtering by ticker, attributes (e.g. close price for equity, or p/e for
    fundamental), or start/end dates is optional. If the ticker is a string,
only data for that ticker will be queried. If the ticker is a list/tuple
of strings, data for all the tickers will be queried. If no arguments are
specified, the entire equity table is returned.
Args:
table_name: Gets data from either the equity or fundamental table
tickers: Gets data for only this ticker or list of tickers.
attributes: Gets data for columns corresponding to list of attributes
start_date: Gets data from after this date (inclusive)
end_date: Gets data from before this date (inclusive)
Returns:
A dataframe of the requested table data.
If the database query fails, returns None.
"""
# Keeps track of SQL WHERE conditions
ticker_conds = []
date_conds = []
# Keeps track of placeholder values
query_params = []
# date column name
date_col_name = (
'date'
if (table_name == 'equity' or table_name == 'equity_raw')
else 'datekey'
)
# Add WHERE conditions for each ticker
if tickers:
if isinstance(tickers, (list, tuple)):
for t in tickers:
ticker_conds.append("ticker = %s")
query_params.append(t)
else:
ticker_conds.append("ticker = %s")
query_params.append(tickers)
# Add WHERE conditions for the start and end date
if start_date:
date_conds.append(date_col_name + " >= %s")
query_params.append(start_date)
if end_date:
date_conds.append(date_col_name + " <= %s")
query_params.append(end_date)
# Build the query and execute it
ticker_query = " OR ".join(ticker_conds)
date_query = " AND ".join(date_conds)
# Build attributes string
# TODO: Currently attributes can only be a list. Add functionality for sets,
# tuples etc. https://gitlab.com/sif_quant_team/siftools/-/issues/1
if not attributes:
attr_string = '*'
else:
if date_col_name not in attributes:
attributes = [date_col_name] + attributes
if 'ticker' not in attributes:
attributes = ['ticker'] + attributes
attr_string = ','.join(attributes)
# Construct WHERE clause for query string
query = f"SELECT {attr_string} FROM {table_name}"
if ticker_query and date_query:
query += f" WHERE ({ticker_query}) AND {date_query}"
elif ticker_query:
query += f" WHERE {ticker_query}"
elif date_query:
query += f" WHERE {date_query}"
    # if table is fundamental_raw, we also need to filter on dimension
if table_name == 'fundamental_raw':
if not (ticker_query or date_query):
query += " WHERE dimension = \'ARQ\'"
else:
query += " AND dimension = \'ARQ\'"
query += ";"
cols, response = execute_query(query, params=query_params)
# If the query was successful, convert and return
if response:
df = pd.DataFrame(data=response, columns=cols)
df.drop_duplicates(
subset=['ticker', date_col_name], keep='first', inplace=True
)
return df
return None
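A usage sketch for the wrapper above: pull a small slice of the equity table (the attribute name is assumed to match a column of that table).
# Example usage of get_data (sketch only)
sample_close = get_data(
    'equity',
    tickers=['AAPL', 'MSFT'],
    start_date=datetime.datetime(2019, 4, 1),
    end_date=datetime.datetime(2019, 4, 30),
    attributes=['close'],
)
sample_close.head()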
def get_fundamental_data_day(
tickers: Optional[Union[str, array_1d]] = None,
date: Optional[date_obj] = None,
attributes: Optional[array_1d] = None,
day_threshold: int = 252,
table_name: str = 'fundamental_raw',
) -> pd.DataFrame:
"""get_fundamental_data, but gets the nearest fundamental data point for
a single day. This is the same as querying from a table with
forward-filled values (daily).
"""
# Keeps track of SQL WHERE conditions
ticker_conds = []
# Keeps track of placeholder values
query_params = []
# date column name
# Add WHERE conditions for each ticker
if tickers:
if isinstance(tickers, (list, tuple)):
for t in tickers:
ticker_conds.append("ticker = %s")
query_params.append(t)
else:
ticker_conds.append("ticker = %s")
query_params.append(tickers)
# Build ticker query
ticker_query = " OR ".join(ticker_conds)
ticker_query = f"AND ({ticker_query})" if ticker_query else ""
# Build attributes string
if not attributes:
attr_string = '*'
else:
if 'ticker' not in attributes:
attributes = ['ticker'] + attributes
if 'datekey' not in attributes:
attributes = ['datekey'] + attributes
attr_string = ','.join(attributes)
# Build full query
query = f"""WITH summary AS (
SELECT {attr_string},
ROW_NUMBER() OVER(PARTITION BY ticker
ORDER BY datekey DESC) AS rk
FROM {table_name}
where datekey <= '{date}'
{ticker_query}
AND dimension = \'ARQ\'
)
SELECT s.*
FROM summary s
WHERE s.rk = 1
;
"""
cols, response = execute_query(query, params=query_params)
if response:
df = pd.DataFrame(data=response, columns=cols)
df.drop('rk', axis=1, inplace=True)
date_diff = (date - pd.to_datetime(df['datekey'])).dt.days
df = df[date_diff <= day_threshold]
return df
return None
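A quick usage sketch: fetch the most recent quarterly filing values on or before a given day (the attribute names are assumed to exist in fundamental_raw, as they are used later in this notebook).
# Example usage of get_fundamental_data_day (sketch only)
latest_fund = get_fundamental_data_day(
    tickers=['AAPL', 'MSFT'],
    date=datetime.datetime(2019, 4, 1),
    attributes=['eps', 'marketcap'],
)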
def get_fundamental_data(
tickers: Optional[Union[str, array_1d]] = None,
start_date: Optional[date_obj] = None,
end_date: Optional[date_obj] = None,
attributes: Optional[array_1d] = None,
raw: bool = True,
) -> Dict[str, pd.DataFrame]:
"""Gets data from fundamental table and returns a dictionary of DataFrame
keyed by attributes. Each of these DataFrame has tickers as columns and
rows as dates and values of the attribute for that day
If the ticker is a string, only data for that ticker will be queried
If the ticker is array1d of strings, data for all tickers will be queried
If no arguments are specified, data from the entire equity table is returned
Args:
tickers: Gets data for only this ticker or list of tickers
attributes: Gets data for columns corresponding to list of attributes
start_date: Gets data from after this date (inclusive)
end_date: Gets data from before this date (inclusive)
raw: If true, data pulled from fundamental_raw table
Otherwise data pulled from fundamental table
Returns:
A Dictionary of DataFrames keyed by attributes
"""
# Query database for rows
table_name = 'fundamental_raw' if raw else 'fundamental'
response_df = get_data(
table_name, tickers, start_date, end_date, attributes
)
    # Check if response_df is None and throw error before using it
    if response_df is None:
        raise TypeError("Response from query was None")
    # Convert to datetime
    response_df['datekey'] = pd.to_datetime(response_df['datekey'])
    # Sort on date
    response_df.sort_values(['datekey'], inplace=True)
# Set date as index
response_df.set_index('datekey', inplace=True)
# Get attributes that are not date and ticker columns
non_date_ticker_attr = response_df.drop(['ticker'], axis=1).columns
return create_attribute_dictionary(response_df, non_date_ticker_attr)
def get_fundamental_filled(
tickers: Optional[Union[str, array_1d]] = None,
start_date: Optional[date_obj] = None,
end_date: Optional[date_obj] = None,
attributes: Optional[array_1d] = None,
day_threshold: int = 365,
dates: Optional[array_1d] = None,
) -> Dict[str, pd.DataFrame]:
"""Pulls data from fundamental_raw table by first finding the most recent
data, and then forward filling any missing data
Args:
tickers: Gets data for only this ticker or list of tickers
start_date: Gets data from after this date (inclusive)
end_date: Gets data from before this date (inclusive)
attributes: Gets data for columns corresponding to list of attributes
day_threshold: threshold for recent data. Default is 365 days
If data is not found in that range for a given
                       ticker, that ticker's fill data will be replaced with NaN
dates: Dates to reindex data for. Default is None, no resampling will be
done. Otherwise, if dates are provided, output will be processed
by resample_index_dict before returning
Returns:
A Dictionary of DataFrames keyed by attributes
"""
# Get initial clean data
initial_data = get_fundamental_data_day(
tickers, start_date, attributes, day_threshold=day_threshold
)
# Make tickers columns and attributes as the index
initial_data.index = initial_data['ticker']
initial_data.drop('ticker', axis=1, inplace=True)
initial_data = initial_data.transpose()
# Get the rest of the data
fund_data = get_fundamental_data(tickers, start_date, end_date, attributes)
# If clean data was not found for certain tickers, add NaN in their place
initial_tickers = set(initial_data.columns)
fund_tickers = set(fund_data[list(fund_data.keys())[0]].columns)
missing_tickers = fund_tickers - initial_tickers
missing_data = np.full(
(initial_data.shape[0], len(missing_tickers)), np.nan
)
missing_df = pd.DataFrame(
missing_data, columns=missing_tickers, index=initial_data.index
)
# Construct full initial dataframe with NaNs from the missing tickers
full_initial = pd.concat([initial_data, missing_df], axis=1)
full_initial = full_initial.reindex(sorted(full_initial.columns), axis=1)
# For each attribute, set the first day as the clean initial data for all
# tickers, forward fill, and then reindex using the original index
for attr in attributes:
idx = fund_data[attr].index
cols = fund_data[attr].columns
fund_data[attr].loc[start_date] = full_initial.loc[attr, cols]
fund_data[attr].sort_index(inplace=True)
fund_data[attr].fillna(method='ffill', inplace=True)
fund_data[attr] = fund_data[attr].loc[idx]
if dates is not None:
fund_data = resample_index_dict(fund_data, dates, True)
return fund_data
def get_equity_data(
tickers: Optional[Union[str, array_1d]] = None,
start_date: Optional[date_obj] = None,
end_date: Optional[date_obj] = None,
attributes: Optional[array_1d] = None,
raw: bool = True,
) -> Dict[str, pd.DataFrame]:
"""Gets data from equity table and returns a dictionary of DataFrame
keyed by attributes. Each of these DataFrame has tickers as columns and
rows as dates and values of the attribute for that day
If the ticker is a string, only data for that ticker will be queried
If the ticker is a array1d of strings, data for the tickers will be queried
If no arguments are specified, data from the entire equity table is returned
Args:
tickers: Gets data for only this ticker or list of tickers
attributes: Gets data for columns corresponding to list of attributes
start_date: Gets data from after this date (inclusive)
end_date: Gets data from before this date (inclusive)
raw: If true, data pulled from equity_raw table
Otherwise data pulled from equity table
Returns:
A Dictionary of DataFrames keyed by attributes.
"""
table_name = 'equity_raw' if raw else 'equity'
# Query database for rows
response_df = get_data(
table_name, tickers, start_date, end_date, attributes
)
    # Check if response_df is None and throw error before using it
    if response_df is None:
        raise TypeError("Response from query was None")
    response_df['date'] = pd.to_datetime(response_df['date'])
# Sort on date
response_df.sort_values(['date'], inplace=True)
# Set date as index
response_df.set_index('date', inplace=True)
# Get attributes that are not date and ticker columns
non_date_ticker_attr = response_df.drop(['ticker'], axis=1).columns
return create_attribute_dictionary(response_df, non_date_ticker_attr)
def create_attribute_dictionary(
df: pd.DataFrame, attributes: array_1d
) -> Dict[str, pd.DataFrame]:
"""Takes Dataframe with attributes as columns and returns a dictionary of
DataFrames. The DataFrames in the dictionary have specified index (e.g.
date) and specified columns (e.g. tickers).
Args:
        df: Pandas DataFrame with attributes in the columns
attributes: list of attributes for keys in the dictionary,
these must be columns in the DataFrame
Returns:
Dictionary of DataFrames keyed by attribute whose index and
columns were given as parameters and values are the attribute value.
"""
df_dict = {}
    df = df.drop_duplicates()  # drop duplicate rows that occasionally appear in query results
# split up by attribute, shape it so tickers are columns
for attr in attributes:
attr_df = df[['ticker', attr]].pivot(columns='ticker', values=attr)
df_dict[attr] = attr_df.reindex(
sorted(attr_df.columns), axis=1
) # sort by columns by ticker symbol
return df_dict
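To make the reshaping concrete, here is a toy long-format frame (hypothetical values, not database output) run through create_attribute_dictionary.
# Toy illustration of create_attribute_dictionary (hypothetical values)
toy = pd.DataFrame(
    {'ticker': ['AAPL', 'MSFT', 'AAPL', 'MSFT'],
     'close': [200.0, 120.0, 202.0, 121.0]},
    index=pd.to_datetime(['2019-04-01', '2019-04-01', '2019-04-02', '2019-04-02']),
)
create_attribute_dictionary(toy, ['close'])['close'] # dates as rows, one column per ticker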
Earnings Before Interest, Taxes, Depreciation, and Amortization ('ebitda'): This indicator measures the overall financial performance, or profitability, of a company. It is often considered a more informative measure of operating performance because it reflects earnings before accounting and financing deductions.
$EBITDA = \text{Net Income} + \text{Interest} + \text{Taxes} + \text{Depreciation} + \text{Amortization}$
Earnings per Share ('eps'): This indicator measures how much money a company makes for each share of its stock, an indication of corporate value. The larger the Earnings per Share, the more profitable the company is.
$\text{Earnings per Share} = \frac{\text{Net Income} - \text{Preferred Dividends}}{\text{End-of-Period Common Shares Outstanding}}$
Enterprise Value ('ev'): This indicator measures the enterprise value of a company, including market capitalization adjusted for short-term and long-term debts as well as any cash on the balance sheet. This is often viewed as the "selling price" that a company would go for.
$\text{Enterprise Value} = \text{Market Capitalization} + \text{Total Debt} - \text{Cash}$
Gross Profit Margin ('grossmargin'): This indicator measures the portion of each dollar of revenue that a company retains as gross profit.
$\text{Gross Profit Margin} = \frac{\text{Revenue} - \text{Cost of Goods Sold}}{\text{Revenue}}$
Price to Earnings Ratio ('pe'): This indicator shows the current share price relative to the Earnings Per Share of a company. Investors use this ratio to determine the relative value of a company's shares.
$\text{P/E} = \frac{\text{Market Value per Share}}{\text{Earnings per Share}}$
Research and Development ('rnd'): This indicator measures a company's spending on activities that drive innovation and the development of new products or services.
Opening Price ('open'): The opening price for a stock is the price at the opening of trading on the New York Stock Exchange at 9:30 a.m. ET.
High Price ('high'): The high price for a stock is the peak intraday price of a stock.
Low Price ('low'): The low price for a stock is the lowest intraday price of a stock.
Adjusted Closing Price ('close'): The adjusted closing price is the closing price of a stock adjusted for stock splits, dividends, and rights offerings.
Volume ('volume'): The number of shares of a given company that change hands over the course of a day.
Dividends ('dividends'): The distribution of a portion of a company's earnings to its shareholders.
Unadjusted Closing Price ('closeunadj'): The unadjusted close price for a stock is the price exactly at the close of trading on the New York Stock Exchange at 4:00 p.m. ET.
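To tie the definitions above together, the ratio-style indicators can be reproduced from their components; the figures below are purely hypothetical and serve only to illustrate the formulas.
# Hypothetical inputs, for illustration only
net_income = 55e9
preferred_dividends = 0
shares_outstanding = 4.6e9
market_cap = 1.10e12
total_debt = 108e9
cash = 100e9
revenue = 260e9
cogs = 162e9
eps = (net_income - preferred_dividends) / shares_outstanding # Earnings per Share
ev = market_cap + total_debt - cash # Enterprise Value
gross_margin = (revenue - cogs) / revenue # Gross Profit Margin
pe = (market_cap / shares_outstanding) / eps # Price to Earnings Ratio
print(eps, ev, gross_margin, pe)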
def extract_data(stocks, start_date, end_date):
"""Queries the PostgreSQL database for Equity and Financial data
for the given list of stocks between the start date and the end date
Args:
stocks: A list of tickers
start_date: A datetime object representing the start date
end_date: A datetime object representing the end date
Returns:
Two dataframes, the first one being equity data and the second
being the fundamental data
"""
fundamental = get_fundamental_filled(stocks, start_date, end_date,
['ebitda',
'eps',
'ev',
'grossmargin',
'pe',
'rnd',
'marketcap'])
equity_data = get_equity_data(stocks, start_date, end_date)
return equity_data, fundamental
#Extract Equity Data and Fundamental Data for each ticker in the S&P500
equity_data, fundamental_data = extract_data(tickers, start_date, end_date)
#Create Dataframes for each variable of interest where columns correspond to the ticker and rows correspond to the date
#EQUITY
close_data = equity_data['close']
#FUNDAMENTAL
ebitda_data = fundamental_data['ebitda']
eps_data = fundamental_data['eps']
ev_data = fundamental_data['ev']
gm_data = fundamental_data['grossmargin']
pe_data = fundamental_data['pe']
rnd_data = fundamental_data['rnd']
mc_data = fundamental_data['marketcap']
ebitda_data.index.names = ['date']
eps_data.index.names = ['date']
ev_data.index.names = ['date']
gm_data.index.names = ['date']
pe_data.index.names = ['date']
rnd_data.index.names = ['date']
mc_data.index.names = ['date']
print(f' Number of stocks in close_data: {close_data.shape[1]}')
print(f' Number of stocks in ev_data: {ev_data.shape[1]}')
print(f' Number of stocks in stock_info: {stock_info.shape[0]}')
# For the list of tickers in the S&P500, drop tickers that aren't present in the fundamental data
stock_info = stock_info[stock_info['Ticker'].isin(ev_data.columns)]
#For each ticker in the close price dataframe, drop tickers that aren't present in the fundamental data
close_data.drop(columns=[t for t in close_data.columns if t not in ev_data.columns], inplace=True)
stock_info.reset_index(inplace = True, drop = True)
print(f' Number of stocks in close_data: {close_data.shape[1]}')
print(f' Number of stocks in ev_data: {ev_data.shape[1]}')
print(f' Number of stocks in stock_info: {stock_info.shape[0]}')
print(f'NaN data in stock_info: {stock_info.isna().sum().sum()}')
print(f'NaN data in close_data: {close_data.isna().sum().sum()}')
print(f'NaN data in ebitda_data: {ebitda_data.isna().sum().sum()}')
print(f'NaN data in gm_data: {gm_data.isna().sum().sum()}')
print(f'NaN data in pe_data: {pe_data.isna().sum().sum()}')
print(f'NaN data in eps_data: {eps_data.isna().sum().sum()}')
print(f'NaN data in ev_data: {ev_data.isna().sum().sum()}')
print(f'NaN data in rnd_data: {rnd_data.isna().sum().sum()}')
print(f'NaN data in mc_data: {mc_data.isna().sum().sum()}')
#Forward-fill, then back-fill, any remaining gaps in each dataframe
for df_ in (close_data, ebitda_data, eps_data, ev_data, gm_data, pe_data, rnd_data, mc_data):
    df_.fillna(method='ffill', inplace=True)
    df_.fillna(method='bfill', inplace=True)
#Drop tickers that still have no P/E data at all
pe_data.dropna(axis=1, inplace=True)
print(f'NaN data in stock_info: {stock_info.isna().sum().sum()}')
print(f'NaN data in close_data: {close_data.isna().sum().sum()}')
print(f'NaN data in ebitda_data: {ebitda_data.isna().sum().sum()}')
print(f'NaN data in gm_data: {gm_data.isna().sum().sum()}')
print(f'NaN data in pe_data: {pe_data.isna().sum().sum()}')
print(f'NaN data in eps_data: {eps_data.isna().sum().sum()}')
print(f'NaN data in ev_data: {ev_data.isna().sum().sum()}')
print(f'NaN data in rnd_data: {rnd_data.isna().sum().sum()}')
print(f'NaN data in mc_data: {mc_data.isna().sum().sum()}')
close_data.head(10)
close_data = close_data.resample('D').ffill().reset_index()
close_data.set_index('date', inplace=True)
ebitda_data = ebitda_data.resample('D').ffill()
eps_data = eps_data.resample('D').ffill()
ev_data = ev_data.resample('D').ffill()
gm_data = gm_data.resample('D').ffill()
pe_data = pe_data.resample('D').ffill()
rnd_data = rnd_data.resample('D').ffill()
mc_data = mc_data.resample('D').ffill()
close_data.head(10)
def get_sp_close(start_date: date_obj, end_date: date_obj) -> pd.Series:
"""temporary function to get S&P500 closing prices
Args:
start_date: Beginning of date range for data
end_date: End of date range for data
Returns:
Series of close prices
"""
api_key = '24XHLRJWJKXJBSSA'
    base_url = (
        'https://www.alphavantage.co/query?'
        'function=TIME_SERIES_DAILY_ADJUSTED'
    )
functions = '&symbol=SPY&apikey={}&datatype=csv&outputsize=full'.format(
api_key
)
full_url = base_url + functions
sp500_data = pd.read_csv(full_url)[
['timestamp', 'adjusted_close', 'dividend_amount']
]
sp500_data.rename(
columns={
'timestamp': 'date',
'adjusted_close': 'close',
'dividend_amount': 'dividend',
},
inplace=True,
)
sp500_data['date'] = pd.to_datetime(sp500_data['date'])
sp500_data.sort_values('date', inplace=True)
sp500_data = sp500_data[
(sp500_data['date'] >= start_date) & (sp500_data['date'] <= end_date)
]
sp500_data.index = sp500_data['date']
return sp500_data['close']
# Get the S&P500 closing price
sp_data = get_sp_close(start_date, end_date)
sp_data = sp_data.resample('D').ffill().reset_index()
sp_data.set_index('date', inplace=True)
fig, ax = plt.subplots(figsize=(10,10))
ax.plot(sp_data.index, sp_data, label = 'Daily SPY')
ax.plot(sp_data.rolling(10).mean(), label = '10 Day Rolling Mean')
ax.plot(sp_data.rolling(30).mean(), label = '30 Day Rolling Mean')
ax.plot(sp_data.rolling(100).mean(), label = '100 Day Rolling Mean')
ax.set_title('S&P500 Closing Price')
ax.set_xlabel('Date')
ax.set_ylabel('Closing price (in USD)')
ax.legend()
fig.show()
print(f"S&P500 return between April 1, 2019 and December 1, 2019: {(sp_data['close'].iloc[-1] - sp_data['close'].iloc[0]) / sp_data['close'].iloc[0] * 100}%")
fig, ax = plt.subplots(figsize=(8,8))
ax.plot(sp_data.index, stats.zscore(sp_data), label = 'SPY')
ax.set_title("Z-scored Clsoing Price for S&P500")
ax.set_ylabel("Standard Deviations away from mean closing price")
ax.set_xlabel("Date")
plt.show()
sp_data.pct_change().plot.hist(bins = 60)
plt.xlabel("Daily returns %")
plt.ylabel("Frequency")
plt.title("S&P500 daily returns data")
plt.text(-0.03,100,"Extremely Low\nreturns")
plt.text(0.02,100,"Extremely High\nreturns")
plt.show()
top_20_ebitda = ebitda_data.mean().sort_values(ascending=False)[0:20]
fig, ax = plt.subplots(figsize=(20,8))
ax.bar(top_20_ebitda.index,top_20_ebitda.to_numpy())
ax.set_title('Companies with highest EBITDA')
fig.show()
top_20_eps = eps_data.mean().sort_values(ascending=False)[0:20]
fig, ax = plt.subplots(figsize=(15,8))
ax.bar(top_20_eps.index,top_20_eps.to_numpy())
ax.set_title('Companies with highest Earnings per Share')
fig.show()
top_20_pe = pe_data.mean().sort_values(ascending=False)[0:20]
fig, ax = plt.subplots(figsize=(15,8))
ax.bar(top_20_pe.index,top_20_pe.to_numpy())
ax.set_title('Companies with highest Price to Earnings Ratio')
fig.show()
top_20_rnd = rnd_data.mean().sort_values(ascending=False)[0:20]
fig, ax = plt.subplots(figsize=(20,8))
ax.bar(top_20_rnd.index,top_20_rnd.to_numpy())
ax.set_title('Companies with highest Research and Development Costs')
fig.show()
top_20_ev = ev_data.mean().sort_values(ascending=False)[0:20]
fig, ax = plt.subplots(figsize=(20,8))
ax.bar(top_20_ev.index,top_20_ev.to_numpy())
ax.set_title('Companies with highest Enterprise Value')
fig.show()
frequencies = stock_info.groupby('Sector').size()
fig1, ax1 = plt.subplots(figsize=(10,10))
ax1.pie(frequencies, labels=frequencies.index, autopct='%1.1f%%', startangle=90, )
ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
#Calculate the total market cap of the top 10 companies in the S&P500
top_10_mc = mc_data.iloc[-1,].sort_values(ascending=False)[0:10].sum()
#Calculate total market cap of the S&P500
all_mc = mc_data.iloc[-1,].sort_values(ascending=False).sum()
print("\nPercent of Total S&P500 Market Capitalization Represented by top 10 stocks:")
print(top_10_mc/all_mc * 100)
top_20_mc = mc_data.iloc[-1].sort_values(ascending=False)[0:20]
fig, ax = plt.subplots(figsize=(15,8))
ax.bar(top_20_mc.index,top_20_mc.to_numpy())
ax.set_title("S&P500 Stocks with Largest Market Capitalization")
ax.set_ylabel('Market Capitalization')
ax.set_xlabel('Tickers')
fig.show()
average_mc = mc_data.mean().sort_values(ascending=False)
frequencies = stock_info.groupby('Sector').size()
mc_sectors = frequencies.copy().astype(float) #use floats to avoid integer truncation when averaging
for i, row in mc_sectors.items():
mc_sectors[i] = 0
for i, row in stock_info.iterrows():
for j, row2 in average_mc.items():
if(stock_info.iloc[i]['Ticker'] == j):
for k, row3 in mc_sectors.items():
if (k == stock_info.iloc[i]['Sector']):
mc_sectors[k] += row2
for i, val in mc_sectors.items():
mc_sectors[i] /= frequencies[i]
mc_sectors
fig1, ax1 = plt.subplots(figsize=(10,10))
ax1.pie(mc_sectors, labels=mc_sectors.index, autopct='%1.1f%%', startangle=90, )
ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
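The per-sector averaging above can also be written without the nested loops; the sketch below uses a join and groupby, and divides by the number of tickers that actually have market-cap data (which matches the loop whenever every ticker in stock_info appears in mc_data).
# Vectorised alternative to the sector-average loop (sketch)
sector_map = stock_info.set_index('Ticker')['Sector']
mc_by_sector = (
    mc_data.mean().rename('avg_marketcap').to_frame()
    .join(sector_map) # attach each ticker's GICS sector
    .groupby('Sector')['avg_marketcap']
    .mean()
)
mc_by_sector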
average_ev = ev_data.mean().sort_values(ascending=False)
frequencies = stock_info.groupby('Sector').size()
ev_sectors = frequencies.copy().astype(float)
for i, row in ev_sectors.items():
ev_sectors[i] = 0
for i, row in stock_info.iterrows():
for j, row2 in average_ev.items():
if(stock_info.iloc[i]['Ticker'] == j):
for k, row3 in ev_sectors.items():
if (k == stock_info.iloc[i]['Sector']):
ev_sectors[k] += row2
for i, val in ev_sectors.items():
ev_sectors[i] /= frequencies[i]
fig1, ax1 = plt.subplots(figsize=(10,10))
ax1.pie(ev_sectors, labels=ev_sectors.index, autopct='%1.1f%%', startangle=90, )
ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
average_ebitda = ebitda_data.mean().sort_values(ascending=False)
frequencies = stock_info.groupby('Sector').size()
ebitda_sectors = frequencies.copy().astype(float)
for i, row in ebitda_sectors.items():
ebitda_sectors[i] = 0
for i, row in stock_info.iterrows():
for j, row2 in average_ebitda.items():
if(stock_info.iloc[i]['Ticker'] == j):
for k, row3 in ebitda_sectors.items():
if (k == stock_info.iloc[i]['Sector']):
ebitda_sectors[k] += row2
for i, val in ebitda_sectors.items():
ebitda_sectors[i] /= frequencies[i]
ebitda_sectors
fig1, ax1 = plt.subplots(figsize=(10,10))
ax1.pie(ebitda_sectors, labels=ebitda_sectors.index, autopct='%1.1f%%', startangle=90, )
ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
average_pe = pe_data.mean().sort_values(ascending=False)
frequencies = stock_info.groupby('Sector').size()
pe_sectors = frequencies.copy().astype(float)
for i, row in pe_sectors.items():
pe_sectors[i] = 0
for i, row in stock_info.iterrows():
for j, row2 in average_pe.items():
if(stock_info.iloc[i]['Ticker'] == j):
for k, row3 in pe_sectors.items():
if (k == stock_info.iloc[i]['Sector']):
pe_sectors[k] += row2
for i, val in pe_sectors.items():
pe_sectors[i] /= frequencies[i]
pe_sectors
fig1, ax1 = plt.subplots(figsize=(10,10))
ax1.pie(pe_sectors, labels=pe_sectors.index, autopct='%1.1f%%', startangle=90, )
ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
average_eps = eps_data.mean().sort_values(ascending=False)
frequencies = stock_info.groupby('Sector').size()
eps_sectors = frequencies.copy().astype('float')
for i, row in eps_sectors.items():
eps_sectors[i] = 0
for i, row in stock_info.iterrows():
for j, row2 in average_eps.items():
if(stock_info.iloc[i]['Ticker'] == j):
for k, row3 in eps_sectors.items():
if (k == stock_info.iloc[i]['Sector']):
eps_sectors[k] += row2
for i, val in eps_sectors.items():
eps_sectors[i] = float(eps_sectors[i]/frequencies[i])
fig1, ax1 = plt.subplots(figsize=(10,10))
ax1.pie(eps_sectors, labels=eps_sectors.index, autopct='%1.1f%%', startangle=90, )
ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
average_rnd = rnd_data.mean().sort_values(ascending=False)
frequencies = stock_info.groupby('Sector').size()
rnd_sectors = frequencies.copy().astype(float)
for i, row in rnd_sectors.items():
rnd_sectors[i] = 0
for i, row in stock_info.iterrows():
for j, row2 in average_rnd.items():
if(stock_info.iloc[i]['Ticker'] == j):
for k, row3 in rnd_sectors.items():
if (k == stock_info.iloc[i]['Sector']):
rnd_sectors[k] += row2
for i, val in rnd_sectors.items():
rnd_sectors[i] /= frequencies[i]
rnd_sectors
fig1, ax1 = plt.subplots(figsize=(10,10))
ax1.pie(rnd_sectors, labels=rnd_sectors.index, autopct='%1.1f%%', startangle=90, )
ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
frame = { 'close': close_data['AAPL'][:-2],
          'ebitda': ebitda_data['AAPL'],
'eps': eps_data['AAPL'],
'ev': ev_data['AAPL'],
'gm': gm_data['AAPL'],
'pe': pe_data['AAPL'],
'rnd': rnd_data['AAPL'],
'mc': mc_data['AAPL']
}
AAPL = pd.DataFrame(frame)
AAPL.head()
#Fit a linear regression model using OLS in statsmodels (SciKit-Learn does not display summary statistics like p-value)
est = sm.OLS(AAPL['close'], AAPL[['ebitda', 'eps', 'ev', 'gm', 'pe', 'rnd', 'mc']])
est2 = est.fit()
print(est2.summary()) #Print summary statistics
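Note that the model above is fit without an intercept (statsmodels does not add one automatically); a variant with a constant term is sketched below.
# Same regression with an intercept added via sm.add_constant
X_const = sm.add_constant(AAPL[['ebitda', 'eps', 'ev', 'gm', 'pe', 'rnd', 'mc']])
est_const = sm.OLS(AAPL['close'], X_const).fit()
print(est_const.summary())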


def predict_auto_arima(data, target, columns, split, days_ahead, verbose = True):
sbn.set(font_scale=1.2)
predicted_returns = {}
datasets = []
count = 1
for col in columns:
print(f'Processing ticker {count}')
print(col)
print('\n')
X = data[target][col]
X = X.dropna()
differenced = X.diff()
datasets.append(differenced)
if verbose:
#Plot the Time Series for close data
plt.figure(figsize=(10,10))
plt.plot(X)
plt.title(col)
plt.ylabel(str(target))
plt.xlabel('Date')
plt.show()
#Check if the data is Stationary with Augmented Dicky-Fuller Test
print("Is the Data Stationary? (Augmented Dicky-Fuller Test)")
adf_test = ADFTest(alpha = 0.05)
dicky_res = adf_test.should_diff(X)
print('p-value: '+str(dicky_res[0]))
print('The Time Series is Stationary? '+str(False if dicky_res[1] else True))
print("\n")
train, test = X[0:int(len(X)*split)], X[int(len(X)*split):] #Create train/test split
test_set_range = test.index
train_set_range = train.index
if verbose:
#Plot the time series and denote the training period and the testing period
plt.figure(figsize=(10,10))
plt.plot(train_set_range, train, label = 'train')
plt.plot(test_set_range, test, label = 'test')
plt.title(col)
plt.xlabel('Date')
plt.ylabel(str(target))
plt.legend()
plt.show()
#Plot the ACF graph
plt.figure(figsize=(10,10))
sm.graphics.tsa.plot_acf(X.values.squeeze(), lags=40)
#plt.show()
#Plot the PACF graph
plt.figure(figsize=(10,10))
sm.graphics.tsa.plot_pacf(X.values.squeeze(), lags=40)
#plt.show()
#Automatically fit an ARIMA model to the time series
        trace = bool(verbose)  # print the stepwise search only when verbose
auto = pm.auto_arima(train, seasonal=False, stepwise=True,
suppress_warnings=True, error_action="ignore", max_p=10, max_q = 10, max_d = 10,
max_order=None, random_state = 9, n_fits = 50, trace=trace)
if verbose:
print(auto.summary())
#Create a dataframe of predicted close price for the period
df = pd.DataFrame()
df["date"] = pd.date_range(test.index[0], periods=days_ahead, freq="D")
df = df.set_index(df["date"])
prediction = pd.DataFrame(auto.predict(n_periods = days_ahead), index = df.index)
prediction.columns = ['predicted_close']
if verbose:
#plot the n-day prediction against the testing period
plt.figure(figsize=(10,10))
plt.plot(train, label = "train")
plt.plot(test, label="test")
plt.plot(prediction, label = "predicted")
plt.title(col)
plt.xlabel('Date')
plt.ylabel('Closing Price')
plt.legend()
plt.show()
pred_price = prediction.iloc[-1][0]
print(prediction)
bought_price = train.iloc[-1]
print(bought_price)
print("Predicted Return:")
pred_ret = (pred_price-bought_price)/bought_price
predicted_returns[col] = prediction
print(pred_ret)
print("\n")
print("Actual Return:")
try:
print((test.iloc[days_ahead] - bought_price)/bought_price)
except:
print("Date for Prediction not in Testing Period")
print("\n")
#Calculate a Rolling Daily Prediction using ARIMA and calculate Mean Squared Error
train_data, test_data = X[0:int(len(X)*split)], X[int(len(X)*split):]
training_data = train_data.values
test_data = test_data.values
history = [x for x in training_data]
model_predictions = []
N_test_observations = len(test_data)
for time_point in range(N_test_observations):
model = ARIMA(history, order=(4,2,0))
model_fit = model.fit(disp=2)
output = model_fit.forecast()
yhat = output[0]
model_predictions.append(yhat)
true_test_value = test_data[time_point]
history.append(true_test_value)
MSE_error = mean_squared_error(test_data, model_predictions)
if verbose:
print('Testing Mean Squared Error is {}'.format(MSE_error))
test_set_range = X[int(len(X)*split):].index
if verbose:
#Plot the rolling predictions during the testing period
plt.figure(figsize=(10,10))
plt.plot(test_set_range, model_predictions, color='blue', linestyle='dashed',label='Predicted Price')
plt.plot(test_set_range, test_data, color='red', label='Actual Price')
plt.title(str(col) + ' Closing Price Prediction')
plt.xlabel('Date')
plt.ylabel('Prices')
plt.legend()
plt.show()
count += 1
return predicted_returns, datasets
a, b = predict_auto_arima(equity_data, 'close', ['WMT', 'JPM','BRK.B'], 0.7, 20, verbose=True)

#Create labels for each stock
labs = pd.DataFrame({'Names' : close_data.columns})
#Since pairwise DTW is costly, we cluster stocks using only the last few days of the period (here, the last 4 rows of close_data)
latter = close_data.iloc[-4:]
#Compute pairwise Dynamic Time Warping Distance between each pair of stocks in the S&P500, store it as a "correlation matrix"
distance_matrix = []
count = 0
for x in latter.columns:
count += 1
distance_matrix.append([])
for y in latter.columns:
distance, path = fastdtw(stats.zscore(latter[x]), stats.zscore(latter[y]), dist=euclidean)
distance_matrix[count-1].append(distance)
#Convert the Matrix into a Dataframe:
matrix = pd.DataFrame(distance_matrix, index = close_data.columns, columns = close_data.columns)
N = matrix.shape[0]
df = matrix
def plot_corr(df):
corr = df.corr()
# Plot the correlation matrix
fig, ax = plt.subplots(figsize=(20,20))
sbn.heatmap(corr.to_numpy(), xticklabels = False, yticklabels = False, ax=ax, cmap="RdYlGn")
plt.title('Dynamic Time-Warping Distance')
fig.show()
plot_corr(df)
#Perform a hierarchical clustering of the matrix and replot the heatmap for the clusters found
X = df.corr().values
d = sch.distance.pdist(X)
L = sch.linkage(X, method='complete')
ind = sch.fcluster(L, 0.5*d.max(), 'distance')
columns = [df.columns.tolist()[i] for i in list((np.argsort(ind)))]
df = df.reindex(columns, axis=1)
plot_corr(df)
labels = df.index.to_numpy()
p = len(labels)
plt.figure(figsize=(30,30))
plt.title('Hierarchical Clustering Dendrogram (truncated)', fontsize=20)
plt.xlabel('Ticker', fontsize=16)
plt.ylabel('distance', fontsize=16)
# call dendrogram to get the returned dictionary
# (plotting parameters can be ignored at this point)
R = dendrogram(
L,
truncate_mode='lastp', # show only the last p merged clusters
p=p, # show only the last p merged clusters
no_plot=False,
)
# create a label dictionary
temp = {R["leaves"][ii]: labels[ii] for ii in range(len(R["leaves"]))}
def llf(xx):
return "{}".format(temp[xx])
dendrogram(
L,
truncate_mode='lastp', # show only the last p merged clusters
p=p, # show only the last p merged clusters
leaf_label_func=llf,
leaf_rotation=60.,
leaf_font_size=7.,
show_contracted=True, # to get a distribution impression in truncated branches
)
plt.show()
def give_cluster_assigns(df, numclust, transpose=True):
clusters = {}
if transpose==True:
data_dist = pdist(df.transpose())
data_link = linkage(data_dist, metric='correlation', method='complete')
cluster_assigns=pd.Series(sch.fcluster(data_link, numclust, criterion='maxclust'), index=df.columns)
else:
data_dist = pdist(df)
data_link = linkage(data_dist, metric='correlation', method='complete')
cluster_assigns=pd.Series(sch.fcluster(data_link, numclust, criterion='maxclust'), index=df.index)
for i in range(1,numclust+1):
print("Cluster ",str(i),": ( N =",len(cluster_assigns[cluster_assigns==i].index),")", ", ".join(list(cluster_assigns[cluster_assigns==i].index)))
clusters[i] = list(cluster_assigns[cluster_assigns==i].index)
return cluster_assigns, clusters
gg, hh = give_cluster_assigns(df, 6, True)
for j in range(1,7):
print(f'Cluster #{j}')
cluster1 = []
cluster1_industry = []
for i, x in gg.items():
if x == j:
cluster1.append(i)
for x in cluster1:
for a, b in stock_info.iterrows():
if b['Ticker'] == x:
cluster1_industry.append(stock_info.iloc[a]['Sector'])
cluster1_industry = pd.Series(cluster1_industry)
frequencies = cluster1_industry.groupby(cluster1_industry).size()
fig1, ax1 = plt.subplots(figsize=(15, 15))
ax1.pie(frequencies, labels=frequencies.index, autopct='%1.1f%%', startangle=90, )
ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
print('Average distance between stocks in the S&P500: ' + str(np.mean(distance_matrix)))
print('Standard Deviation of distance between stocks in the S&P500: ' + str(np.std(distance_matrix)))
graph = np.where(distance_matrix <= np.mean(distance_matrix)-np.std(distance_matrix), 1, 0)
G = nx.from_numpy_matrix(np.array(graph))
nx.draw(G, with_labels=True, labels=dict(labs['Names']))
import itertools
k = 6
comp = nx.algorithms.community.centrality.girvan_newman(G)
for communities in itertools.islice(comp, k):
print(tuple(sorted(c) for c in communities))
graph_vals = pd.DataFrame({'Eigenvector Centrality' : list(nx.algorithms.centrality.eigenvector_centrality(G).values()),
'Closeness Centrality': list(nx.algorithms.centrality.closeness_centrality(G).values()),
'Betweenness Centrality': list(nx.algorithms.centrality.betweenness_centrality(G).values()),
'Degree Centrality': list(nx.algorithms.centrality.degree_centrality(G).values())}, index = list(nx.algorithms.centrality.degree_centrality(G).keys()))
graph_vals.head()
graph_vals.set_index(labs['Names'], inplace=True)
fig, ax = plt.subplots(figsize=(25,8))
top_centralities = graph_vals['Closeness Centrality'].sort_values(ascending=False)[0:40]
plt.bar(top_centralities.index, top_centralities)
plt.title('Stocks in S&P500 with highest Closeness Centrality')
plt.ylabel('Closeness Centrality')
plt.xlabel('Ticker')
fig.show()
fig, ax = plt.subplots(figsize=(20,8))
top_centralities = graph_vals['Betweenness Centrality'].sort_values(ascending=False)[0:15]
plt.bar(top_centralities.index, top_centralities)
plt.title('Stocks in S&P500 with highest Betweenness Centrality')
plt.ylabel('Betweenness Centrality')
plt.xlabel('Ticker')
fig.show()
fig, ax = plt.subplots(figsize=(25,8))
top_centralities = graph_vals['Degree Centrality'].sort_values(ascending=False)[0:40]
plt.bar(top_centralities.index, top_centralities)
plt.title('Stocks in S&P500 with highest Degree Centrality')
plt.ylabel('Degree Centrality')
plt.xlabel('Ticker')
fig.show()
fig, ax = plt.subplots(figsize=(25,8))
top_centralities = graph_vals['Eigenvector Centrality'].sort_values(ascending=False)[0:40]
plt.bar(top_centralities.index, top_centralities)
plt.title('Stocks in S&P500 with highest Eigenvector Centrality')
plt.ylabel('Eigenvector Centrality')
plt.xlabel('Ticker')
fig.show()
finwiz_url = 'https://finviz.com/quote.ashx?t='
news_tables = {}
tickers2 = tickers.copy() #Copy so the original ticker list is left untouched
tickers2.append('BRK-B') #Format for these tickers is different on Finviz
tickers2.append('BF-B')
for ticker in tickers2:
try:
url = finwiz_url + ticker #Construct a url to query
req = Request(url=url,headers={'user-agent': 'my-app/0.0.1'}) #Send a GET request
response = urlopen(req)
        # Parse the response HTML with BeautifulSoup
        html = BeautifulSoup(response, 'lxml')
# Find 'news-table' in the Soup and load it into 'news_table'
news_table = html.find(id='news-table')
# Add the table to our dictionary
news_tables[ticker] = news_table
except:
continue
# Read one single day of headlines for 'AMZN'
amzn = news_tables['AMZN']
# Get all the table rows tagged in HTML with <tr> into 'amzn_tr'
amzn_tr = amzn.findAll('tr')
for i, table_row in enumerate(amzn_tr):
# Read the text of the element 'a' into 'link_text'
a_text = table_row.a.text
# Read the text of the element 'td' into 'data_text'
td_text = table_row.td.text
# Print the contents of 'link_text' and 'data_text'
print(a_text)
print(td_text)
# Exit after printing 4 rows of data
if i == 3:
break
parsed_news = []
# Iterate through the news
for file_name, news_table in news_tables.items():
# Iterate through all tr tags in 'news_table'
for x in news_table.findAll('tr'):
# read the text from each tr tag into text
# get text from a only
text = x.a.get_text()
# splite text in the td tag into a list
date_scrape = x.td.text.split()
# if the length of 'date_scrape' is 1, load 'time' as the only element
if len(date_scrape) == 1:
time = date_scrape[0]
# else load 'date' as the 1st element and 'time' as the second
else:
date = date_scrape[0]
time = date_scrape[1]
        # The dictionary key is the ticker itself (the split on '_' is kept for safety)
        ticker = file_name.split('_')[0]
# Append ticker, date, time and headline as a list to the 'parsed_news' list
parsed_news.append([ticker, date, time, text])
import nltk
nltk.download('vader_lexicon')
# Instantiate the sentiment intensity analyzer
vader = SentimentIntensityAnalyzer()
# Set column names
columns = ['ticker', 'date', 'time', 'headline']
# Convert the parsed_news list into a DataFrame called 'parsed_and_scored_news'
parsed_and_scored_news = pd.DataFrame(parsed_news, columns=columns)
# Iterate through the headlines and get the polarity scores using vader
scores = parsed_and_scored_news['headline'].apply(vader.polarity_scores).tolist()
# Convert the 'scores' list of dicts into a DataFrame
scores_df = pd.DataFrame(scores)
# Join the DataFrames of the news and the list of dicts
parsed_and_scored_news = parsed_and_scored_news.join(scores_df, rsuffix='_right')
# Convert the date column from string to datetime
parsed_and_scored_news['date'] = pd.to_datetime(parsed_and_scored_news.date).dt.date
parsed_and_scored_news.head()
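As a quick sanity check on the scoring, the analyzer can be applied to a single made-up headline; 'compound' is VADER's normalized aggregate score in [-1, 1].
# Score a single hypothetical headline (illustration only)
vader.polarity_scores("Company beats earnings expectations and raises guidance")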
stocks = []
sentiments = []
dd = parsed_and_scored_news.groupby(['ticker'])
for key, value in dd:
if key in gg.index:
stocks.append(key)
sentiments.append(value['compound'].mean())
sentiment_info = pd.DataFrame({'Ticker': stocks, 'Sentiment': sentiments}) #Create Dataframe
sentiment_info = sentiment_info.set_index('Ticker')
sentiment_info.head()
print('Average Sentiment for S&P500: '+str(sentiment_info['Sentiment'].mean()))
neg_sent = sentiment_info['Sentiment'].sort_values(ascending=True)[0:10]
plt.figure(figsize=(10,5))
plt.bar(neg_sent.index, neg_sent)
plt.title('Stocks in the S&P500 with lowest sentiment')
plt.show()
pos_sent = sentiment_info['Sentiment'].sort_values(ascending=False)[0:10]
plt.figure(figsize=(10,5))
plt.bar(pos_sent.index, pos_sent)
plt.title('Stocks in the S&P500 with highest sentiment')
plt.show()
average_sentiment = {}
for i, x in stock_info.iterrows():
for j, y in sentiment_info.iterrows():
if j == x['Ticker']:
try:
average_sentiment[x['Sector']] += y['Sentiment']
except:
average_sentiment[x['Sector']] = y['Sentiment']
for i, x in stock_info.groupby(['Sector']).count()['Name'].items():
average_sentiment[i] /= x
average_sentiment
plt.figure(figsize=(20, 10))
plt.title('Sentiment by Sector')
plt.bar(average_sentiment.keys(), average_sentiment.values())
plt.xticks(rotation=65)
plt.show()
average_sentiment = {}
for i, x in gg.items():
for j, y in sentiment_info.iterrows():
if j == i:
try:
average_sentiment[x] += y['Sentiment']
except:
average_sentiment[x] = y['Sentiment']
for x in average_sentiment.keys():
average_sentiment[x] /= len(hh[x])
average_sentiment
plt.figure(figsize=(20, 10))
plt.title('Sentiment by Cluster')
plt.bar(average_sentiment.keys(), average_sentiment.values())
plt.show()